{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "00cf0ca6-7f7a-47d7-b647-e796365cf383",
   "metadata": {},
   "source": [
    "## Import the desired transforms, the data access class, and the transform chain orchestrator"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "d690b830-a9d9-45df-828f-cfffd2d7ca67",
   "metadata": {},
   "outputs": [],
   "source": [
    "from data_processing.data_access import DataAccessLocal\n",
    "from dpk_docling2parquet import docling2parquet_contents_types, Docling2ParquetTransform\n",
    "from dpk_doc_chunk import DocChunkTransform\n",
    "from dpk_transform_chain import TransformsChain"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9e436b6c-80e0-40ba-b94f-f5e12051170f",
   "metadata": {},
   "source": [
    "## Define transform parameters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "9fff4ca4-979d-46ea-8feb-af8b39414188",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Stage 1 (docling2parquet): request Markdown as the converted contents type.\n",
    "docling2parquet_params = {\n",
    "    \"contents_type\": docling2parquet_contents_types.MARKDOWN,\n",
    "}\n",
    "\n",
    "# Stage 2 (doc_chunk): chunker selection plus chunk size and overlap, both in tokens.\n",
    "doc_chunk_params = {\n",
    "    \"chunking_type\": \"li_markdown\",\n",
    "    \"chunk_size_tokens\": 128,\n",
    "    \"chunk_overlap_tokens\": 30,\n",
    "}"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "26c6371e-ee95-4c14-99c7-4ff877aae974",
   "metadata": {},
   "source": [
    "## Instantiate transform components (fully compatible with existing transform logic)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "8dbb2e88-b00c-45c1-960e-bd7a5bd18da3",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "23:07:49 INFO - Initializing models\n"
     ]
    }
   ],
   "source": [
    "# Build one transform instance per stage from the parameter dicts defined above.\n",
    "# NOTE(review): this cell logs \"Initializing models\", so construction presumably\n",
    "# loads models and may be slow on first run - confirm.\n",
    "doc2parquet = Docling2ParquetTransform(docling2parquet_params)\n",
    "doc_chunk = DocChunkTransform(doc_chunk_params)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "784bdd98-c885-40e4-aacb-b17e09d48239",
   "metadata": {},
   "source": [
    "## Set up data access"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "a55e4c09-41e0-4302-b6a1-ed934ec927c1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Folders are relative to the directory the notebook is run from.\n",
    "local_folders = {\n",
    "    \"input_folder\": \"test-data/binary_input\",\n",
    "    \"output_folder\": \"test-data/binary_output\",\n",
    "}\n",
    "\n",
    "# NOTE(review): files_to_use presumably restricts the input scan to PDF files - confirm.\n",
    "da_config = {\n",
    "    \"config\": local_folders,\n",
    "    \"files_to_use\": [\".pdf\"],\n",
    "}\n",
    "\n",
    "data_access = DataAccessLocal(**da_config)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3e50c28d-e0a2-4e75-92a2-e8067ec3ebc4",
   "metadata": {},
   "source": [
    "## Define sequence and run transform chain"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "98f1d787-0479-447f-82e8-d4871070edab",
   "metadata": {},
   "outputs": [],
   "source": [
    "# The chain is given the document conversion stage first and the chunking stage second.\n",
    "# NOTE(review): transforms are presumably applied in list order - confirm.\n",
    "orch = TransformsChain(data_access=data_access, transforms=[doc2parquet, doc_chunk])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "ad1bac4b-3b91-4012-8895-403a2c57927f",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "23:11:55 INFO - Processing all 1 files in one batch\n",
      "23:11:55 INFO - ['/Users/shalisha.witherspoonibm.com/Documents/DPK_CHAIN_UPDATE/data-prep-kit-outer/transforms/chain/test-data/binary_input/opea_project_github_io_latest_introduction_index_html-1.pdf']\n",
      "23:11:55 INFO - Processing file: /Users/shalisha.witherspoonibm.com/Documents/DPK_CHAIN_UPDATE/data-prep-kit-outer/transforms/chain/test-data/binary_input/opea_project_github_io_latest_introduction_index_html-1.pdf\n",
      "23:11:56 INFO - Finished processing and saved: /Users/shalisha.witherspoonibm.com/Documents/DPK_CHAIN_UPDATE/data-prep-kit-outer/transforms/chain/test-data/binary_output/opea_project_github_io_latest_introduction_index_html-1.parquet\n"
     ]
    }
   ],
   "source": [
    "# Run the chain; per-file progress and the saved output path are logged to stderr.\n",
    "orch.run()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0822c2c8-9432-47a0-aafa-7465db3cbac1",
   "metadata": {},
   "source": [
    "## Inspect the generated Parquet output"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "1a3d3e39-dda4-457c-a953-8d6647ae700a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>filename</th>\n",
       "      <th>num_pages</th>\n",
       "      <th>num_tables</th>\n",
       "      <th>num_doc_elements</th>\n",
       "      <th>document_hash</th>\n",
       "      <th>ext</th>\n",
       "      <th>hash</th>\n",
       "      <th>size</th>\n",
       "      <th>date_acquired</th>\n",
       "      <th>document_convert_time</th>\n",
       "      <th>source_filename</th>\n",
       "      <th>source_document_id</th>\n",
       "      <th>contents</th>\n",
       "      <th>document_id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>opea_project_github_io_latest_introduction_ind...</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>66</td>\n",
       "      <td>16225671439394140203</td>\n",
       "      <td>pdf</td>\n",
       "      <td>45a605c585431f959465bebf225bf5c4a52b5ee9f466bd...</td>\n",
       "      <td>4384</td>\n",
       "      <td>2025-08-20T23:11:56.727852</td>\n",
       "      <td>1.617634</td>\n",
       "      <td>opea_project_github_io_latest_introduction_ind...</td>\n",
       "      <td>5f171c27-f29b-4e64-af91-5121f6540277</td>\n",
       "      <td>&lt;!-- image --&gt;</td>\n",
       "      <td>e5dc819883d3c4b39a4a3f3054652d05bbceb23ae07f1d...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>opea_project_github_io_latest_introduction_ind...</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>66</td>\n",
       "      <td>16225671439394140203</td>\n",
       "      <td>pdf</td>\n",
       "      <td>45a605c585431f959465bebf225bf5c4a52b5ee9f466bd...</td>\n",
       "      <td>4384</td>\n",
       "      <td>2025-08-20T23:11:56.727852</td>\n",
       "      <td>1.617634</td>\n",
       "      <td>opea_project_github_io_latest_introduction_ind...</td>\n",
       "      <td>5f171c27-f29b-4e64-af91-5121f6540277</td>\n",
       "      <td>##  OPEA Overview\\n\\n-  OPEA Project Archite...</td>\n",
       "      <td>2976a73d5b22c5c85fe475b86408966158467b1fa7854d...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>opea_project_github_io_latest_introduction_ind...</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>66</td>\n",
       "      <td>16225671439394140203</td>\n",
       "      <td>pdf</td>\n",
       "      <td>45a605c585431f959465bebf225bf5c4a52b5ee9f466bd...</td>\n",
       "      <td>4384</td>\n",
       "      <td>2025-08-20T23:11:56.727852</td>\n",
       "      <td>1.617634</td>\n",
       "      <td>opea_project_github_io_latest_introduction_ind...</td>\n",
       "      <td>5f171c27-f29b-4e64-af91-5121f6540277</td>\n",
       "      <td>## Getting Started with OPEA\\n\\nOPEA Tutoria l...</td>\n",
       "      <td>005ca58cb03214fec4e6447dd4ea411f9f300b3373e7db...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>opea_project_github_io_latest_introduction_ind...</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>66</td>\n",
       "      <td>16225671439394140203</td>\n",
       "      <td>pdf</td>\n",
       "      <td>45a605c585431f959465bebf225bf5c4a52b5ee9f466bd...</td>\n",
       "      <td>4384</td>\n",
       "      <td>2025-08-20T23:11:56.727852</td>\n",
       "      <td>1.617634</td>\n",
       "      <td>opea_project_github_io_latest_introduction_ind...</td>\n",
       "      <td>5f171c27-f29b-4e64-af91-5121f6540277</td>\n",
       "      <td>## OPEA Overview\\n\\nOPEA (Open P l atform for ...</td>\n",
       "      <td>cde6282cd9de29d6826210b3126718b887922ea3bb24d3...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>opea_project_github_io_latest_introduction_ind...</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>66</td>\n",
       "      <td>16225671439394140203</td>\n",
       "      <td>pdf</td>\n",
       "      <td>45a605c585431f959465bebf225bf5c4a52b5ee9f466bd...</td>\n",
       "      <td>4384</td>\n",
       "      <td>2025-08-20T23:11:56.727852</td>\n",
       "      <td>1.617634</td>\n",
       "      <td>opea_project_github_io_latest_introduction_ind...</td>\n",
       "      <td>5f171c27-f29b-4e64-af91-5121f6540277</td>\n",
       "      <td>## OPEA Project Architecture\\n\\nOPEA uses micr...</td>\n",
       "      <td>61356a4e7c4841526469d3747e4f23d23196fa917bebf1...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>opea_project_github_io_latest_introduction_ind...</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>66</td>\n",
       "      <td>16225671439394140203</td>\n",
       "      <td>pdf</td>\n",
       "      <td>45a605c585431f959465bebf225bf5c4a52b5ee9f466bd...</td>\n",
       "      <td>4384</td>\n",
       "      <td>2025-08-20T23:11:56.727852</td>\n",
       "      <td>1.617634</td>\n",
       "      <td>opea_project_github_io_latest_introduction_ind...</td>\n",
       "      <td>5f171c27-f29b-4e64-af91-5121f6540277</td>\n",
       "      <td>## Microservices: Flexible and Scalable Archit...</td>\n",
       "      <td>da0da7937f25781dbf6b9f065125a2252f5f36a6f28437...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>opea_project_github_io_latest_introduction_ind...</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>66</td>\n",
       "      <td>16225671439394140203</td>\n",
       "      <td>pdf</td>\n",
       "      <td>45a605c585431f959465bebf225bf5c4a52b5ee9f466bd...</td>\n",
       "      <td>4384</td>\n",
       "      <td>2025-08-20T23:11:56.727852</td>\n",
       "      <td>1.617634</td>\n",
       "      <td>opea_project_github_io_latest_introduction_ind...</td>\n",
       "      <td>5f171c27-f29b-4e64-af91-5121f6540277</td>\n",
       "      <td>## Megaservices: A Comprehensive Solution\\n\\nM...</td>\n",
       "      <td>2c1231af1fa993080f8dd37d3d57f5d6e8e13cfb8f646a...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>opea_project_github_io_latest_introduction_ind...</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>66</td>\n",
       "      <td>16225671439394140203</td>\n",
       "      <td>pdf</td>\n",
       "      <td>45a605c585431f959465bebf225bf5c4a52b5ee9f466bd...</td>\n",
       "      <td>4384</td>\n",
       "      <td>2025-08-20T23:11:56.727852</td>\n",
       "      <td>1.617634</td>\n",
       "      <td>opea_project_github_io_latest_introduction_ind...</td>\n",
       "      <td>5f171c27-f29b-4e64-af91-5121f6540277</td>\n",
       "      <td>## Gateways: Customized Access to Mega- and Mi...</td>\n",
       "      <td>175e149a7582cff182a5c157e6f557b0086f49630671cb...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>opea_project_github_io_latest_introduction_ind...</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>66</td>\n",
       "      <td>16225671439394140203</td>\n",
       "      <td>pdf</td>\n",
       "      <td>45a605c585431f959465bebf225bf5c4a52b5ee9f466bd...</td>\n",
       "      <td>4384</td>\n",
       "      <td>2025-08-20T23:11:56.727852</td>\n",
       "      <td>1.617634</td>\n",
       "      <td>opea_project_github_io_latest_introduction_ind...</td>\n",
       "      <td>5f171c27-f29b-4e64-af91-5121f6540277</td>\n",
       "      <td>## Next Step\\n\\nLinks to:\\n\\n- Getting Started...</td>\n",
       "      <td>b96630a036a41df13c96117d2a02e645d305f2a8506e5a...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                            filename  num_pages  num_tables  \\\n",
       "0  opea_project_github_io_latest_introduction_ind...          1           0   \n",
       "1  opea_project_github_io_latest_introduction_ind...          1           0   \n",
       "2  opea_project_github_io_latest_introduction_ind...          1           0   \n",
       "3  opea_project_github_io_latest_introduction_ind...          1           0   \n",
       "4  opea_project_github_io_latest_introduction_ind...          1           0   \n",
       "5  opea_project_github_io_latest_introduction_ind...          1           0   \n",
       "6  opea_project_github_io_latest_introduction_ind...          1           0   \n",
       "7  opea_project_github_io_latest_introduction_ind...          1           0   \n",
       "8  opea_project_github_io_latest_introduction_ind...          1           0   \n",
       "\n",
       "   num_doc_elements         document_hash  ext  \\\n",
       "0                66  16225671439394140203  pdf   \n",
       "1                66  16225671439394140203  pdf   \n",
       "2                66  16225671439394140203  pdf   \n",
       "3                66  16225671439394140203  pdf   \n",
       "4                66  16225671439394140203  pdf   \n",
       "5                66  16225671439394140203  pdf   \n",
       "6                66  16225671439394140203  pdf   \n",
       "7                66  16225671439394140203  pdf   \n",
       "8                66  16225671439394140203  pdf   \n",
       "\n",
       "                                                hash  size  \\\n",
       "0  45a605c585431f959465bebf225bf5c4a52b5ee9f466bd...  4384   \n",
       "1  45a605c585431f959465bebf225bf5c4a52b5ee9f466bd...  4384   \n",
       "2  45a605c585431f959465bebf225bf5c4a52b5ee9f466bd...  4384   \n",
       "3  45a605c585431f959465bebf225bf5c4a52b5ee9f466bd...  4384   \n",
       "4  45a605c585431f959465bebf225bf5c4a52b5ee9f466bd...  4384   \n",
       "5  45a605c585431f959465bebf225bf5c4a52b5ee9f466bd...  4384   \n",
       "6  45a605c585431f959465bebf225bf5c4a52b5ee9f466bd...  4384   \n",
       "7  45a605c585431f959465bebf225bf5c4a52b5ee9f466bd...  4384   \n",
       "8  45a605c585431f959465bebf225bf5c4a52b5ee9f466bd...  4384   \n",
       "\n",
       "                date_acquired  document_convert_time  \\\n",
       "0  2025-08-20T23:11:56.727852               1.617634   \n",
       "1  2025-08-20T23:11:56.727852               1.617634   \n",
       "2  2025-08-20T23:11:56.727852               1.617634   \n",
       "3  2025-08-20T23:11:56.727852               1.617634   \n",
       "4  2025-08-20T23:11:56.727852               1.617634   \n",
       "5  2025-08-20T23:11:56.727852               1.617634   \n",
       "6  2025-08-20T23:11:56.727852               1.617634   \n",
       "7  2025-08-20T23:11:56.727852               1.617634   \n",
       "8  2025-08-20T23:11:56.727852               1.617634   \n",
       "\n",
       "                                     source_filename  \\\n",
       "0  opea_project_github_io_latest_introduction_ind...   \n",
       "1  opea_project_github_io_latest_introduction_ind...   \n",
       "2  opea_project_github_io_latest_introduction_ind...   \n",
       "3  opea_project_github_io_latest_introduction_ind...   \n",
       "4  opea_project_github_io_latest_introduction_ind...   \n",
       "5  opea_project_github_io_latest_introduction_ind...   \n",
       "6  opea_project_github_io_latest_introduction_ind...   \n",
       "7  opea_project_github_io_latest_introduction_ind...   \n",
       "8  opea_project_github_io_latest_introduction_ind...   \n",
       "\n",
       "                     source_document_id  \\\n",
       "0  5f171c27-f29b-4e64-af91-5121f6540277   \n",
       "1  5f171c27-f29b-4e64-af91-5121f6540277   \n",
       "2  5f171c27-f29b-4e64-af91-5121f6540277   \n",
       "3  5f171c27-f29b-4e64-af91-5121f6540277   \n",
       "4  5f171c27-f29b-4e64-af91-5121f6540277   \n",
       "5  5f171c27-f29b-4e64-af91-5121f6540277   \n",
       "6  5f171c27-f29b-4e64-af91-5121f6540277   \n",
       "7  5f171c27-f29b-4e64-af91-5121f6540277   \n",
       "8  5f171c27-f29b-4e64-af91-5121f6540277   \n",
       "\n",
       "                                            contents  \\\n",
       "0                                     <!-- image -->   \n",
       "1  ##  OPEA Overview\\n\\n-  OPEA Project Archite...   \n",
       "2  ## Getting Started with OPEA\\n\\nOPEA Tutoria l...   \n",
       "3  ## OPEA Overview\\n\\nOPEA (Open P l atform for ...   \n",
       "4  ## OPEA Project Architecture\\n\\nOPEA uses micr...   \n",
       "5  ## Microservices: Flexible and Scalable Archit...   \n",
       "6  ## Megaservices: A Comprehensive Solution\\n\\nM...   \n",
       "7  ## Gateways: Customized Access to Mega- and Mi...   \n",
       "8  ## Next Step\\n\\nLinks to:\\n\\n- Getting Started...   \n",
       "\n",
       "                                         document_id  \n",
       "0  e5dc819883d3c4b39a4a3f3054652d05bbceb23ae07f1d...  \n",
       "1  2976a73d5b22c5c85fe475b86408966158467b1fa7854d...  \n",
       "2  005ca58cb03214fec4e6447dd4ea411f9f300b3373e7db...  \n",
       "3  cde6282cd9de29d6826210b3126718b887922ea3bb24d3...  \n",
       "4  61356a4e7c4841526469d3747e4f23d23196fa917bebf1...  \n",
       "5  da0da7937f25781dbf6b9f065125a2252f5f36a6f28437...  \n",
       "6  2c1231af1fa993080f8dd37d3d57f5d6e8e13cfb8f646a...  \n",
       "7  175e149a7582cff182a5c157e6f557b0086f49630671cb...  \n",
       "8  b96630a036a41df13c96117d2a02e645d305f2a8506e5a...  "
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import glob\n",
    "\n",
    "import pandas as pd\n",
    "\n",
    "# Must point inside the output_folder configured for DataAccessLocal above.\n",
    "OUTPUT_PARQUET_GLOB = \"test-data/binary_output/*.parquet\"\n",
    "\n",
    "# glob.glob() returns matches in arbitrary order; sort so the row order is reproducible.\n",
    "parquet_files = sorted(glob.glob(OUTPUT_PARQUET_GLOB))\n",
    "if not parquet_files:\n",
    "    # Without this guard pd.concat() fails with an opaque \"No objects to concatenate\".\n",
    "    raise FileNotFoundError(\n",
    "        f\"No parquet files match {OUTPUT_PARQUET_GLOB!r}; run the transform chain first.\"\n",
    "    )\n",
    "\n",
    "# Load every output file into a single frame with a fresh 0..n-1 index.\n",
    "df = pd.concat(\n",
    "    (pd.read_parquet(parquet_file) for parquet_file in parquet_files),\n",
    "    ignore_index=True,\n",
    ")\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9aafb40b-8953-4001-9400-ae1461df67eb",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "binary_chain",
   "language": "python",
   "name": "binary_chain"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
